1 Pseudobulk and UpSet Plots

#load integrated object
load(file = "forebrain_all_final.RObj")

##' Create a pseudoBulk from a single-cell expression matrix
##'
##' Create a pseudoBulk from a single-cell expression matrix
##'
##' @param seurat.obj Integrated seurat object 
##' @param clusters Vector or factor along the cells giving cluster memberships.
##' Must have length(clusters) == ncol(n). If a factor then unused levels will be
##' dropped; if not a factor then will be converted to one.
##' @param rowAggregator Default rowSum. Or some other function that accepts a matrix
##' and returns a numeric vector along the rows giving some "bulk summary" of the expression value.
##' @return Matrix of features x clusters
##' @author Shristi Pandey
##' @export
##' 
pseudoBulk <- function(seurat.obj, clusters, rowAggregator = rowSums)  {
  #n <- as.matrix(GetAssayData(seurat.obj, slot = 'counts'))
  n <- GetAssayData(seurat.obj, slot = 'counts')
  stopifnot(length(clusters) == ncol(n))
  
  ## drop unused levels if it IS a factor, otherwise make it a factor
  clusters <- factor(clusters)  
  
  names(clusterLevels) <- clusterLevels <- levels(clusters)
  
  psb <- sapply(clusterLevels, function(cl)  {
    #.richMessage("... collapsing ", cl, caller = "pseudoBulk")
    i <- clusters == cl
    vec <- rowAggregator(n[,i, drop = FALSE])
    stopifnot(names(vec) == rownames(n))  ## double check we are preserving rownames
    vec
  })
  
  return(psb)
}


##' Given a seurat object, returns a set up summarized object to run pseudobulk with voom
##'
##' @param seurat.obj Seurat Object 
##' @param treatment string representing the genotype/timepoint or other treatment type
##' @param sample_field String representing the sample metadata
##' @param celltype_field  String representing the cluster field in the seurat object
##' @param study_field String representing the study field if the object represents an integration of multiple studies 
##' @param cluster_prop Proportion of the cluster in the total dataset expressed in percent. For creating different psbs for different cluster sizes. To modulate the minimum
##' @author Shristi Pandey
##' @return Summarized experiment object with appropriate 
##' 

setup_se_psb <- function(seurat.obj, treatment_field=NULL, sample_field = 'sampleID', celltype_field = NULL, study_field = 'orig.ident'){
  library(Seurat)
  library(dplyr)
  library(edgeR)
  library(SummarizedExperiment)
  DefaultAssay(seurat.obj) <- 'RNA'  # important when you are performing experiment with integrated object 
  if (is.null(treatment_field)){
    stop('No treatment field provided')
  }
  treatment = unlist(seurat.obj[[treatment_field]])
  if (is.null(sample_field)){
    stop('No sample field provided')
  }
  samples = unlist(seurat.obj[[sample_field]])
  if (!is.null(celltype_field)){
    celltype = unlist(seurat.obj[[celltype_field]])
  }
  if (is.null(celltype_field)){
    pseudobulkID <- paste0(treatment,'_',samples)
  } else {
    pseudobulkID <- paste0(treatment,'_',samples, '_', celltype)
  }
  
  seurat.obj$pseudobulkID <- pseudobulkID
  psb_clusters <- factor(seurat.obj$pseudobulkID)
  numcells <- table(seurat.obj$pseudobulkID)
  #check order of the metadata to pseudobulk order 
  psb <- pseudoBulk(seurat.obj, psb_clusters, rowAggregator = rowSums)
  
  if (is.null(celltype_field)) {
    col_data <- data.frame(seurat.obj$pseudobulkID, seurat.obj[[treatment_field]], seurat.obj[[sample_field]], seurat.obj[[study_field]])
    colnames(col_data) <- c('pseudobulkID', 'treatment', 'batch', 'study')
  } else{
    col_data <- data.frame(seurat.obj$pseudobulkID, seurat.obj[[treatment_field]], seurat.obj[[sample_field]], seurat.obj[[celltype_field]], seurat.obj[[study_field]])
    colnames(col_data) <- c('pseudobulkID', 'treatment', 'batch', 'celltype', 'study')
  }
  col_data <- unique(col_data)
  rownames(col_data) <- col_data$pseudobulkID
  col_data <- col_data[colnames(psb),]
  col_data$nCells <- numcells[rownames(col_data)]
  if(!all(rownames(col_data) == colnames(psb))) {
    stop('Rownames of coldata do not match with the colnames of the assay matrix. Check')
  }
  
  norm_counts <- cpm(psb, log=FALSE)
  norm_counts <- log2(norm_counts+1)
  se <- SummarizedExperiment(assays = list(counts = psb, normalized = norm_counts), colData = col_data)
  
  return (se)
}



##' Sets up and runs voom 
##'
##' @param se Summarized experiment
##' @param groups slot containing the groups that you would like to compare
##' @param comparisons comparisons that are of interest; used to create contrasts
##' @return results for all comparisons listed in comparisons
##' @author Shristi Pandey
##' @export
##' 
setup_and_runVoom <- function(se = NULL, groups = NULL, comparisons = NULL){
  
  library(edgeR)
  library(dplyr)
  d0 <- DGEList(assay(se, 1))
  d0$genes <- data.frame(origin=seq_len(nrow(se1)))
  d0 <- calcNormFactors(d0)
  

  meta <- colData(se) %>% as.data.frame()
  data <- subset(meta, select = c('treatment'))
  #extract out the treatment col 
  design <- model.matrix(~0+treatment, data = data)

  filtered <- filterByExpr(d0, design=design, large.n=0)
  summary(filtered)
  d0 <- d0[filtered,]
  d0 <- calcNormFactors(d0)
  head(d0$samples, 10)

  library(limma)
  v <- voomWithQualityWeights(d0, design=design, plot=TRUE)
  fit <- lmFit(v)
  fit <- eBayes(fit, robust=TRUE)
  plotSA(fit)
  summary(fit$df.prior)
  
  all_results <- List()
  
  #create contrasts for the first comparison: 
  con <- integer(ncol(design))
  names(con) <- colnames(design)
  con[c("treatmentAdult", "treatment15dpf")] <- c(1L, -1L)
  fit2 <- contrasts.fit(fit, contrasts = con)
  fit2 <- eBayes(fit2, robust=TRUE)
  
  res <- topTable(fit2, n=Inf, sort.by="none")
  head(res[order(res$adj.P.Val),])
  summary(decideTests(fit2))
  summary(res$adj.P.Val <= 0.05)
  plotMD(fit2, status=decideTests(fit2), hl.cex=0.5)
  volcanoplot(fit2)
  res$Comparison <- "Difference between `Adult` vs `15dpf`"
  all_results[["Difference between `Adult` vs `15dpf`"]] <- res
  
  
  #create contrasts for the Adult vs. 6dpf comparison and shove it to all_results: 
  con <- integer(ncol(design))
  names(con) <- colnames(design)
  con[c("treatmentAdult", "treatment6dpf")] <- c(1L, -1L)
  fit2 <- contrasts.fit(fit, contrasts = con)
  fit2 <- eBayes(fit2, robust=TRUE)
  
  res <- topTable(fit2, n=Inf, sort.by="none")
  head(res[order(res$adj.P.Val),])
  summary(decideTests(fit2))
  summary(res$adj.P.Val <= 0.05)
  plotMD(fit2, status=decideTests(fit2), hl.cex=0.5)
  volcanoplot(fit2)
  res$Comparison <- 'Difference between Adult vs. 6dpf'
  all_results[["Difference between `Adult` vs `6dpf`"]] <- res

  
  
  #create contrasts for the 15dpf vs. 6dpf comparison and shove it to all_results: 
  con <- integer(ncol(design))
  names(con) <- colnames(design)
  con[c("treatment15dpf", "treatment6dpf")] <- c(1L, -1L)
  fit2 <- contrasts.fit(fit, contrasts = con)
  fit2 <- eBayes(fit2, robust=TRUE)
  
  res <- topTable(fit2, n=Inf, sort.by="none")
  
  head(res[order(res$adj.P.Val),])
  summary(decideTests(fit2))
  summary(res$adj.P.Val <= 0.05)
  plotMD(fit2, status=decideTests(fit2), hl.cex=0.5)
  volcanoplot(fit2)
  res$Comparison <- "Difference between `15dpf` vs `6dpf`"
  all_results[["Difference between `15dpf` vs `6dpf`"]] <- res
  
  return(all_results)
  
}

forebrain.integrated$study <- 'sp'
se <- setup_se_psb(seurat.obj=forebrain.integrated, treatment_field='age', celltype_field='clusterInterp2', sample_field = 'batch', study_field = 'study')
se <- se[, se$nCells > 50]
#not including Pallium01 and subpallium02 because there are 0 samples at 6 and 15dpf
se <- se[, se$celltype %in% c('Pallium_02', 'Pallium_03', 'Pallium_04', 'Pallium_05', 'Pallium_06', 'Pallium_07', 'Pallium_08', 'Pallium_09', 
                              'Subpallium_02', 'Subpallium_03', 'Subpallium_04', 'Subpallium_05', 'Subpallium_06', 'Subpallium_07', 'Subpallium_08')]



#create results table for all comparisons
cols <- c("genes","origin", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B", 'Comparisons', 'Celltype')
adult_vs_15dpf <- setNames(data.frame(matrix(ncol = 10, nrow = 0)), c(cols))
adult_vs_6dpf <- setNames(data.frame(matrix(ncol = 10, nrow = 0)), c(cols))
a15dpf_vs_6dpf <- setNames(data.frame(matrix(ncol = 10, nrow = 0)), c(cols))


for (elem in unique(se$celltype)){
  se1 <- se[, se$celltype == elem]
  all_results <- setup_and_runVoom(se1,
                                   groups = 'treatment',
                                   comparisons = list(c('Adult', '15dpf'), c('Adult', '6dpf'), c('15dpf', '6dpf')))
  
  b <- as.data.frame(subset(all_results[[1]], abs(logFC) >= log2(2) & adj.P.Val <= 0.05))
  c <- as.data.frame(subset(all_results[[2]], abs(logFC) >= log2(2) & adj.P.Val <= 0.05))
  d <- as.data.frame(subset(all_results[[3]], abs(logFC) >= log2(2) & adj.P.Val <= 0.05))
  
  b$genes <- rownames(b)
  b$Comparisons <- 'Adult_vs_15dpf'
  b$Celltype <- elem
  adult_vs_15dpf <- merge(adult_vs_15dpf, b, all = T)
  
  c$genes <- rownames(c)
  c$Comparisons <- 'Adult_vs_6dpf'
  c$Celltype <- elem  
  adult_vs_6dpf <- merge(adult_vs_6dpf, c, all = T)
  
  d$genes <- rownames(d)
  d$Comparisons <- '15_vs_6dpf'
  d$Celltype <- elem  
  a15dpf_vs_6dpf <- merge(a15dpf_vs_6dpf, d, all = T)
  
}

write.csv(adult_vs_15dpf, file = '~/adult_Vs_15dpf.csv')
write.csv(adult_vs_6dpf, file = '~/adult_vs_6dpf.csv')
write.csv(a15dpf_vs_6dpf, file = '~/a15dpf_vs_6dpf.csv')

## create upset plots
library(UpSetR)
a15dpf_vs_6dpf <- read.csv("~/a15dpf_vs_6dpf.csv", row.names = 1)
adult_vs_15dpf <- read.csv("~/adult_Vs_15dpf.csv", row.names = 1)
adult_vs_6dpf <- read.csv("~/adult_vs_6dpf.csv", row.names = 1)

adult_vs_15dpf <- adult_vs_15dpf[adult_vs_15dpf$logFC >0, ]
a15dpf_vs_6dpf <- a15dpf_vs_6dpf[a15dpf_vs_6dpf$logFC >0, ]
adult_vs_6dpf <- adult_vs_6dpf[adult_vs_6dpf$logFC >0, ]

#adult vs 6
dat_list = split(adult_vs_6dpf, adult_vs_6dpf$Celltype)

upsetlist <- vector(mode = "list", length = length(dat_list))
names(upsetlist) <- names(dat_list)
for (i in 1:length(dat_list)){
  genes <- c()
  #print(names(dat_list[i]))
  genes <- dat_list[[i]]$genes
  #print(genes)
  upsetlist[[names(dat_list[i])]] =  genes
  
}
upset(fromList(upsetlist), order.by = "freq", nsets = length(upsetlist))

#6 v 15dpf
dat_list = split(a15dpf_vs_6dpf, a15dpf_vs_6dpf$Celltype)

upsetlist <- vector(mode = "list", length = length(dat_list))
names(upsetlist) <- names(dat_list)
for (i in 1:length(dat_list)){
  genes <- c()
  #print(names(dat_list[i]))
  genes <- dat_list[[i]]$genes
  #print(genes)
  upsetlist[[names(dat_list[i])]] =  genes
  
}
upset(fromList(upsetlist), order.by = "freq", nsets = length(upsetlist))

#adult vs 15dpf
dat_list = split(adult_vs_15dpf, adult_vs_15dpf$Celltype)

upsetlist <- vector(mode = "list", length = length(dat_list))
names(upsetlist) <- names(dat_list)
for (i in 1:length(dat_list)){
  genes <- c()
  #print(names(dat_list[i]))
  genes <- dat_list[[i]]$genes
  #print(genes)
  upsetlist[[names(dat_list[i])]] =  genes
  
}
upset(fromList(upsetlist), order.by = "freq", nsets = length(upsetlist))

2 Session info

Packages and versions necessary to reproduce the results in this report.

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux
## 
## Matrix products: default
## BLAS/LAPACK: /data/rc/apps/rc/software/FlexiBLAS/3.0.4-GCC-10.3.0/lib64/libflexiblas.so.3.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] UpSetR_1.4.0                SummarizedExperiment_1.28.0
##  [3] Biobase_2.58.0              GenomicRanges_1.50.2       
##  [5] GenomeInfoDb_1.34.9         IRanges_2.32.0             
##  [7] S4Vectors_0.36.2            BiocGenerics_0.44.0        
##  [9] MatrixGenerics_1.10.0       matrixStats_0.63.0         
## [11] edgeR_3.40.2                limma_3.54.2               
## [13] dplyr_1.1.1                 statmod_1.5.0              
## [15] SeuratObject_4.1.3          Seurat_4.3.0               
## [17] knitr_1.42                  tinytex_0.44               
## [19] rmarkdown_2.21             
## 
## loaded via a namespace (and not attached):
##   [1] plyr_1.8.8             igraph_1.4.1           lazyeval_0.2.2        
##   [4] sp_1.6-0               splines_4.2.0          listenv_0.9.0         
##   [7] scattermore_0.8        ggplot2_3.4.1          digest_0.6.31         
##  [10] htmltools_0.5.5        fansi_1.0.4            magrittr_2.0.3        
##  [13] tensor_1.5             cluster_2.1.4          ROCR_1.0-11           
##  [16] globals_0.16.2         spatstat.sparse_3.0-1  colorspace_2.1-0      
##  [19] ggrepel_0.9.3          xfun_0.38              RCurl_1.98-1.12       
##  [22] jsonlite_1.8.4         progressr_0.13.0       spatstat.data_3.0-1   
##  [25] survival_3.5-5         zoo_1.8-11             glue_1.6.2            
##  [28] polyclip_1.10-4        gtable_0.3.3           zlibbioc_1.44.0       
##  [31] XVector_0.38.0         leiden_0.4.3           DelayedArray_0.24.0   
##  [34] future.apply_1.10.0    abind_1.4-5            scales_1.2.1          
##  [37] DBI_1.1.3              spatstat.random_3.1-4  miniUI_0.1.1.1        
##  [40] Rcpp_1.0.10            viridisLite_0.4.1      xtable_1.8-4          
##  [43] reticulate_1.28        htmlwidgets_1.6.2      httr_1.4.5            
##  [46] RColorBrewer_1.1-3     ellipsis_0.3.2         ica_1.0-3             
##  [49] farver_2.1.1           pkgconfig_2.0.3        sass_0.4.5            
##  [52] uwot_0.1.14            deldir_1.0-6           locfit_1.5-9.7        
##  [55] utf8_1.2.3             tidyselect_1.2.0       labeling_0.4.2        
##  [58] rlang_1.1.0            reshape2_1.4.4         later_1.3.0           
##  [61] munsell_0.5.0          tools_4.2.0            cachem_1.0.7          
##  [64] cli_3.6.1              generics_0.1.3         ggridges_0.5.4        
##  [67] evaluate_0.20          stringr_1.5.0          fastmap_1.1.1         
##  [70] yaml_2.3.7             goftest_1.2-3          fitdistrplus_1.1-8    
##  [73] purrr_1.0.1            RANN_2.6.1             pbapply_1.7-0         
##  [76] future_1.32.0          nlme_3.1-162           mime_0.12             
##  [79] compiler_4.2.0         rstudioapi_0.14        plotly_4.10.1         
##  [82] png_0.1-8              spatstat.utils_3.0-2   tibble_3.2.1          
##  [85] bslib_0.4.2            stringi_1.7.12         highr_0.10            
##  [88] lattice_0.20-45        Matrix_1.5-3           vctrs_0.6.1           
##  [91] pillar_1.9.0           lifecycle_1.0.3        spatstat.geom_3.1-0   
##  [94] lmtest_0.9-40          jquerylib_0.1.4        RcppAnnoy_0.0.20      
##  [97] data.table_1.14.8      cowplot_1.1.1          bitops_1.0-7          
## [100] irlba_2.3.5.1          httpuv_1.6.9           patchwork_1.1.2       
## [103] R6_2.5.1               promises_1.2.0.1       KernSmooth_2.23-20    
## [106] gridExtra_2.3          parallelly_1.35.0      codetools_0.2-19      
## [109] MASS_7.3-58.3          withr_2.5.0            sctransform_0.3.5     
## [112] GenomeInfoDbData_1.2.9 parallel_4.2.0         grid_4.2.0            
## [115] tidyr_1.3.0            Rtsne_0.16             spatstat.explore_3.1-0
## [118] shiny_1.7.4